load(here::here("data", "working", "model_objects", "basic_model_bayes_no_interact.RData"))
load(here::here("data", "working", "model_objects", "basic_model_bayes_yes_interact.RData"))
load(here::here("data", "working", "model_objects", "basic_model_freq_no_interact.RData"))
load(here::here("data", "working", "model_objects", "basic_model_freq_yes_interact.RData"))
load(here::here("data", "working", "model_objects", "neighbor_model_bayes_no_interact.RData"))
load(here::here("data", "working", "model_objects", "neighbor_model_bayes_yes_interact.RData"))
library(rstanarm)
## Loading required package: Rcpp
## rstanarm (Version 2.19.3, packaged: 2020-02-11 05:16:41 UTC)
## - Do not expect the default priors to remain the same in future rstanarm versions.
## Thus, R scripts should specify priors explicitly, even if they are just the defaults.
## - For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores())
## - bayesplot theme set to bayesplot::theme_default()
## * Does _not_ affect other ggplot2 plots
## * See ?bayesplot_theme_set for details on theme setting
library(bayesplot)
## This is bayesplot version 1.7.2
## - Online documentation and vignettes at mc-stan.org/bayesplot
## - bayesplot theme set to bayesplot::theme_default()
## * Does _not_ affect other ggplot2 plots
## * See ?bayesplot_theme_set for details on theme setting
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(sf)
## Linking to GEOS 3.7.2, GDAL 2.4.0, PROJ 6.1.1
library(broom)
library(purrr)
library(glue)
##
## Attaching package: 'glue'
## The following object is masked from 'package:dplyr':
##
## collapse
library(stringr)
library(viridis)
## Loading required package: viridisLite
library(spdep)
## Loading required package: sp
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
library(ape)
## Registered S3 method overwritten by 'ape':
## method from
## plot.mst spdep
prepared_data <- readr::read_csv(here::here("data", "final", "response_time_model_data_prepared.csv"))
## Parsed with column specification:
## cols(
## response_time_hundreths_of_minutes = col_double(),
## patient_age = col_double(),
## patient_gender = col_character(),
## response_vehicle_type_collapsed = col_character(),
## after_covid = col_logical(),
## impression_category = col_character(),
## possible_impression_category_collapsed = col_character(),
## patient_first_race_collapsed = col_character(),
## time_of_day = col_double(),
## scene_gps_latitude = col_double(),
## scene_gps_longitude = col_double(),
## incident_dispatch_notified_to_unit_arrived_on_scene_in_minutes = col_double(),
## log_trans_dispatch_time = col_double()
## )
prepared_data_sp <- sf::st_read(here::here("data", "final", "response_time_model_data_prepared_sp.geojson"))
## Reading layer `response_time_model_data_prepared_sp' from data source `/project/class/bii_sdad_dspg/uva/charlottesville-ems_equity/final/response_time_model_data_prepared_sp.geojson' using driver `GeoJSON'
## Simple feature collection with 51544 features and 15 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: -78.83887 ymin: 37.72275 xmax: -78.20938 ymax: 38.27793
## CRS: 4326
freq_models <- list("basic_model_freq_no_interact" = basic_model_freq_no_interact,
"basic_model_freq_yes_interact" = basic_model_freq_yes_interact)
bayes_models <- stanreg_list("basic_model_bayes_no_interact" = basic_model_bayes_no_interact,
"basic_model_bayes_yes_interact" = basic_model_bayes_yes_interact,
"neighbor_model_bayes_no_interact" = neighbor_model_bayes_no_interact,
"neighbor_model_bayes_yes_interact" = neighbor_model_bayes_yes_interact)
Setup everything
freq_model_coefs <- map(freq_models, tidy,
conf.int = TRUE,
conf.level = 0.95,
exponentiate = FALSE)
bayes_model_coefs <- map(bayes_models, ~tidy(.x$stanfit,
estimate.method = "median",
conf.int = TRUE,
conf.level = 0.95)) %>%
map(~filter(.x, !(term %in% c("sigma", "mean_PPD", "log-posterior"))))
freq_model_coefs_trans <- freq_model_coefs %>%
map(~mutate(.x, across(c(estimate,
conf.low,
conf.high),
list(scale_factor = ~exp(.x * (.x != .x[1])), # scales to exponential, expression insures intercept is right
time_to_incident = ~exp(.x * (.x != .x[1]) + .x[1]))))) # scales to exponential, expression insures intercept is right
bayes_model_coefs_trans <- bayes_model_coefs %>%
map(~mutate(.x, across(c(estimate,
conf.low,
conf.high),
list(scale_factor = ~exp(.x * (.x != .x[1])),
time_to_incident = ~exp(.x * (.x != .x[1]) + .x[1])))))
plot_scale_factor <- function(data) {
ggplot(data, aes(y = term)) +
geom_point(aes(x = estimate_scale_factor)) +
geom_errorbar(aes(xmin = conf.low_scale_factor, xmax = conf.high_scale_factor)) +
theme_minimal() +
labs(x = NULL, y = glue("Scale Factor from {round(data$estimate_time_to_incident[1],
digits = 1)} Minutes")) +
coord_cartesian(xlim = c(0.25, 1.75)) +
geom_vline(xintercept = 1, alpha = 0.5)
}
plot_time_to_incident <- function(data) {
ggplot(data) +
geom_point(aes(x = term, y = estimate_time_to_incident)) +
geom_errorbar(aes(x = term, ymin = conf.low_time_to_incident, ymax = conf.high_time_to_incident)) +
theme_minimal() +
coord_flip() +
labs(x = NULL, y = glue("Travel Time in Minutes"))
}
Actually make the plots:
map(bayes_model_coefs_trans,
plot_scale_factor)
## $basic_model_bayes_no_interact
##
## $basic_model_bayes_yes_interact
##
## $neighbor_model_bayes_no_interact
##
## $neighbor_model_bayes_yes_interact
map(bayes_model_coefs_trans,
plot_time_to_incident)
## $basic_model_bayes_no_interact
##
## $basic_model_bayes_yes_interact
##
## $neighbor_model_bayes_no_interact
##
## $neighbor_model_bayes_yes_interact
Looks like the after covid interaction term doesn't do a lot in the non neighborhood models. In the neighborhood models however, it looks like times consistently increase.
Now for frequentist models:
map(freq_model_coefs_trans,
plot_scale_factor)
## $basic_model_freq_no_interact
##
## $basic_model_freq_yes_interact
map(freq_model_coefs_trans,
plot_time_to_incident)
## $basic_model_freq_no_interact
##
## $basic_model_freq_yes_interact
Basically identical to the bayesian models
cbbPalette <- c("#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
bayes_model_coefs_trans[[2]] %>%
mutate(after_covid = str_detect(term, "(after_covidTRUE.*)")) %>%
mutate(term = str_replace(term, "after_covidTRUE:", "")) %>%
ggplot(aes(y = term, color = after_covid)) +
geom_point(aes(x = estimate_scale_factor)) +
geom_errorbar(aes(xmin = conf.low_scale_factor, xmax = conf.high_scale_factor)) +
theme_minimal() +
scale_color_manual(values = cbbPalette) +
coord_cartesian(xlim = c(0.5, 1.5)) +
geom_vline(xintercept = 1, alpha = 0.5) +
labs(y = NULL, x = glue("Scale Factor from {round(bayes_model_coefs_trans[[2]]$estimate_time_to_incident[1],
digits = 1)} Minutes"),
color = "During Covid")
bayes_model_coefs_trans[[4]] %>%
mutate(after_covid = str_detect(term, "(after_covidTRUE.*)")) %>%
mutate(term = str_replace(term, "after_covidTRUE:", "")) %>%
ggplot(aes(y = term, color = after_covid)) +
geom_point(aes(x = estimate_scale_factor)) +
geom_errorbar(aes(xmin = conf.low_scale_factor, xmax = conf.high_scale_factor)) +
theme_minimal() +
scale_color_manual(values = cbbPalette) +
coord_cartesian(xlim = c(0.5, 1.5)) +
geom_vline(xintercept = 1, alpha = 0.5) +
labs(y = NULL, x = glue("Scale Factor from {round(bayes_model_coefs_trans[[4]]$estimate_time_to_incident[1],
digits = 1)} Minutes"),
color = "During Covid")
neighborhood_coefs <- map(bayes_model_coefs_trans, ~.x %>%
filter(str_detect(term, r"(b\[)")) %>%
mutate(NAME = str_extract(term, r"((?<=NAME:).*(?=\]))")) %>%
mutate(NAME = str_replace_all(NAME, "_", " ")))[3:4]
neighborhood_coefs_sp <- map(neighborhood_coefs, ~left_join(prepared_data_sp %>%
group_by(NAME) %>%
slice_head(1) %>%
ungroup(), .x, by = "NAME"))
neighborhood_coefs_sp$neighbor_model_bayes_no_interact %>%
ggplot() +
geom_sf(aes(fill = estimate_scale_factor))
I will only be examining bayesian fits because they are almost identical to the frequentist fits.
First we'll look at the loo statistics
(loo_list <- map(bayes_models, loo))
## Warning: Found 1 observation(s) with a pareto_k > 0.7. We recommend calling 'loo' again with argument 'k_threshold = 0.7' in order to calculate the ELPD without the assumption that these observations are negligible. This will refit the model 1 times to compute the ELPDs for the problematic observations directly.
## Warning: Found 1 observation(s) with a pareto_k > 0.7. We recommend calling 'loo' again with argument 'k_threshold = 0.7' in order to calculate the ELPD without the assumption that these observations are negligible. This will refit the model 1 times to compute the ELPDs for the problematic observations directly.
## $basic_model_bayes_no_interact
##
## Computed from 10000 by 51544 log-likelihood matrix
##
## Estimate SE
## elpd_loo -31745.5 293.0
## p_loo 25.2 0.7
## looic 63491.0 586.0
## ------
## Monte Carlo SE of elpd_loo is 0.0.
##
## All Pareto k estimates are good (k < 0.5).
## See help('pareto-k-diagnostic') for details.
##
## $basic_model_bayes_yes_interact
##
## Computed from 25000 by 51544 log-likelihood matrix
##
## Estimate SE
## elpd_loo -31755.8 292.8
## p_loo 45.5 1.5
## looic 63511.7 585.6
## ------
## Monte Carlo SE of elpd_loo is NA.
##
## Pareto k diagnostic values:
## Count Pct. Min. n_eff
## (-Inf, 0.5] (good) 51543 100.0% 8953
## (0.5, 0.7] (ok) 0 0.0% <NA>
## (0.7, 1] (bad) 1 0.0% 55
## (1, Inf) (very bad) 0 0.0% <NA>
## See help('pareto-k-diagnostic') for details.
##
## $neighbor_model_bayes_no_interact
##
## Computed from 25000 by 51544 log-likelihood matrix
##
## Estimate SE
## elpd_loo -24799.8 372.3
## p_loo 65.2 1.9
## looic 49599.5 744.5
## ------
## Monte Carlo SE of elpd_loo is 0.0.
##
## All Pareto k estimates are good (k < 0.5).
## See help('pareto-k-diagnostic') for details.
##
## $neighbor_model_bayes_yes_interact
##
## Computed from 25000 by 51544 log-likelihood matrix
##
## Estimate SE
## elpd_loo -24807.8 372.0
## p_loo 85.1 2.3
## looic 49615.5 744.1
## ------
## Monte Carlo SE of elpd_loo is NA.
##
## Pareto k diagnostic values:
## Count Pct. Min. n_eff
## (-Inf, 0.5] (good) 51543 100.0% 5851
## (0.5, 0.7] (ok) 0 0.0% <NA>
## (0.7, 1] (bad) 1 0.0% 100
## (1, Inf) (very bad) 0 0.0% <NA>
## See help('pareto-k-diagnostic') for details.
loo_compare(loo_list)
## elpd_diff se_diff
## neighbor_model_bayes_no_interact 0.0 0.0
## neighbor_model_bayes_yes_interact -8.0 5.3
## basic_model_bayes_no_interact -6945.8 149.6
## basic_model_bayes_yes_interact -6956.1 149.8
Looks like the neighborhood level models do a lot! better, and they all have reasonable specifications
map(bayes_models, pp_check)
## $basic_model_bayes_no_interact
##
## $basic_model_bayes_yes_interact
##
## $neighbor_model_bayes_no_interact
##
## $neighbor_model_bayes_yes_interact
The capture the majority of model variation, the remaining portion is likely due to it not being truly normal. Maybe a proper spatial mixed effects model could solve that?
Lets now check the residuals on a map:
bayes_residuals <- map(bayes_models, residuals)
augmented_data <- prepared_data_sp %>%
mutate(resid_basic_no = bayes_residuals$basic_model_bayes_no_interact,
resid_basic_yes = bayes_residuals$basic_model_bayes_yes_interact,
resid_neighbor_no = bayes_residuals$neighbor_model_bayes_no_interact,
resid_neighbor_yes = bayes_residuals$neighbor_model_bayes_yes_interact)
Basic Linear Model with No Interaction Term
augmented_data %>%
ggplot() +
stat_summary_hex(aes(x = scene_gps_longitude,
y = scene_gps_latitude,
z = resid_basic_no)) +
geom_sf(fill = NA,
color = "#444444",
alpha = 0.3,
size = 0.1) +
scale_fill_viridis() +
theme_minimal() +
labs(x = "Longitude",
y = "Latitude",
fill = "Mean Residuals",
title = "Basic Linear Model, No Interactions")
Basic Linear Model with Interaction Term
augmented_data %>%
ggplot() +
stat_summary_hex(aes(x = scene_gps_longitude,
y = scene_gps_latitude,
z = resid_basic_yes)) +
geom_sf(fill = NA,
color = "#444444",
alpha = 0.3,
size = 0.1) +
scale_fill_viridis() +
theme_minimal() +
labs(x = "Longitude",
y = "Latitude",
fill = "Mean Residuals",
title = "Basic Linear Model, Interactions")
Neighborhood Model with No Interaction Term
augmented_data %>%
ggplot() +
stat_summary_hex(aes(x = scene_gps_longitude,
y = scene_gps_latitude,
z = resid_neighbor_no)) +
geom_sf(fill = NA,
color = "#444444",
alpha = 0.3,
size = 0.1) +
scale_fill_viridis() +
theme_minimal() +
labs(x = "Longitude",
y = "Latitude",
fill = "Mean Residuals",
title = "Neighborhood Model, No Interactions")
Neighborhood Model with Interaction Term
augmented_data %>%
ggplot() +
stat_summary_hex(aes(x = scene_gps_longitude,
y = scene_gps_latitude,
z = resid_neighbor_yes)) +
geom_sf(fill = NA,
color = "#444444",
alpha = 0.3,
size = 0.1) +
scale_fill_viridis() +
theme_minimal() +
labs(x = "Longitude",
y = "Latitude",
fill = "Mean Residuals",
title = "Neighborhood Model, Interactions")
set.seed(451)
augmented_data_sampled <- augmented_data %>%
sample_n(2000)
distances <- as.matrix(dist(cbind(augmented_data_sampled$scene_gps_longitude, augmented_data_sampled$scene_gps_latitude)))
distances_inv <- 1 / distances
diag(distances) <- 0
distances_inv[is.infinite(distances_inv)] <- 0
First I'll use ape for global Moran's I statistics
ape::Moran.I(augmented_data_sampled$resid_basic_no, distances_inv)
## $observed
## [1] 0.06648128
##
## $expected
## [1] -0.0005002501
##
## $sd
## [1] 0.004039064
##
## $p.value
## [1] 0
ape::Moran.I(augmented_data_sampled$resid_basic_yes, distances_inv)
## $observed
## [1] 0.06661546
##
## $expected
## [1] -0.0005002501
##
## $sd
## [1] 0.004039059
##
## $p.value
## [1] 0
ape::Moran.I(augmented_data_sampled$resid_neighbor_no, distances_inv)
## $observed
## [1] 0.02011118
##
## $expected
## [1] -0.0005002501
##
## $sd
## [1] 0.004036932
##
## $p.value
## [1] 3.295459e-07
ape::Moran.I(augmented_data_sampled$resid_neighbor_yes, distances_inv)
## $observed
## [1] 0.02022165
##
## $expected
## [1] -0.0005002501
##
## $sd
## [1] 0.00403693
##
## $p.value
## [1] 2.850324e-07
Next we'll check for local spatial autocorrelation using spdep
morans_list <- augmented_data_sampled %>%
st_drop_geometry() %>%
select(resid_basic_no:resid_neighbor_yes) %>%
map(~localmoran(.x, mat2listw(distances_inv)))
Lets plot the z-scores
map2(morans_list, names(morans_list), function(moran_info, variable_name) {
z_score <- moran_info[,4]
augmented_data_sampled %>%
ggplot() +
stat_summary_hex(aes(x = scene_gps_longitude,
y = scene_gps_latitude,
z = z_score)) +
geom_sf(fill = NA,
color = "#444444",
alpha = 0.3,
size = 0.1) +
theme_minimal() +
labs(x = "Longitude",
y = "Latitude",
fill = "Mean Z-Score",
title = variable_name) +
scale_fill_gradient2()
})
## $resid_basic_no
##
## $resid_basic_yes
##
## $resid_neighbor_no
##
## $resid_neighbor_yes
Looks like when neighborhood terms are included, spatial autocorrelation completly dissapears. This is robust across several samples, but only one is shown here due to computational issues.